###############################################################################
# Standardize and clean California Inyo County audit data
###############################################################################
import pandas as pd
import numpy as np
import re

#Short aliases for the two pandas container types, taken from the public
# pandas namespace (the same classes that live in pandas.core)
DataFrame = pd.DataFrame
Series = pd.Series


###############################################################################
# Read and shape dataset
###############################################################################
file = '../transcribed/inyo-nov2020-audit.csv'

#Report column names -> usable specification variable names
column_names = {
    "Jurisdiction Name": "county_name",
    "Container": "container",
    "Tabulator": "tabulator",
    "Batch Name": "batch",
}

#Load everything as text, then shape in a single pipeline:
#   - header=33: the column names sit on row 33 as counted by read_csv
#   - special shaping: keep only the first 401 data rows
#     (NOTE(review): presumably the rows after that are not ballot records
#      -- confirm against the transcribed file)
#   - rename fails loudly (errors="raise") if an expected column is missing
#   - missing cells become the string "nan" via astype(str) and are then
#     relabelled "BLANK"
ballots = (
    pd.read_csv(file, header=33, dtype=str)
    .iloc[:401]
    .rename(columns=column_names, errors="raise")
    .astype(str)
    .replace("nan", "BLANK")
)


###############################################################################
# Data creation functions
###############################################################################
def ReadBallots(series, office):
    """Count, for each candidate, the ballots whose selection names them.

    Every entry of ``series`` is one ballot's selection for a single contest:
    either one name or several names separated by ``","``. A name is matched
    as a whole, literal string, so "Ann" is not counted on an "Anna" ballot
    and names containing regex metacharacters (e.g. parentheses) are counted
    correctly.

    Parameters
    ----------
    series : iterable of str
        Per-ballot selections (in this script, a pandas Series of strings).
    office : str
        Accepted for call compatibility; not used by the count.

    Returns
    -------
    dict
        Maps each candidate name to the number of ballots selecting it.
        Empty input yields an empty dict.
    """
    #!!! WARNING: ASSUMES CANDIDATES SEPARATED BY "," IN MULTIVOTE ELECTION !!!
    candVotes = {}
    for selection in series:
        names = []
        for name in selection.split(","):
            #Names after the first usually carry one leading space ("A, B")
            if name.startswith(" "):
                name = name[1:]
            #Skip empty pieces (e.g. from a trailing comma)
            if name:
                names.append(name)
        #dict.fromkeys de-duplicates while keeping order, so a name repeated
        # within one ballot still counts as a single ballot for that name
        for name in dict.fromkeys(names):
            candVotes[name] = candVotes.get(name, 0) + 1
    return candVotes

def CountLiteralDiffs(curr_data, original, audit):
    """Count, per candidate, ballots where the original names them but the audit does not.

    The two columns are compared ballot by ballot. Each cell holds one name
    or several names separated by ``","``. Names are compared as whole,
    literal strings, so "Ann" on the original and "Anna" on the audit is a
    difference for "Ann", and names containing regex metacharacters are
    handled correctly.

    Parameters
    ----------
    curr_data : DataFrame
        Ballot rows; only ``curr_data[original]`` and ``curr_data[audit]``
        are read, and they are walked in parallel.
    original : str
        Name of the column holding the originally recorded selection.
    audit : str
        Name of the column holding the audited selection.

    Returns
    -------
    dict
        Has one key for every candidate appearing in either column (0 when
        there is no difference); the value is the number of ballots where
        the candidate is in the original selection but not the audited one.
    """
    def splitNames(selection):
        #One cell -> ordered, de-duplicated candidate names
        names = []
        for name in selection.split(","):
            #Names after the first usually carry one leading space ("A, B")
            if name.startswith(" "):
                name = name[1:]
            #Skip empty pieces (e.g. from a trailing comma)
            if name:
                names.append(name)
        return list(dict.fromkeys(names))

    candDiffs = {}
    for origSelection, auditSelection in zip(curr_data[original],
                                             curr_data[audit]):
        origNames = splitNames(origSelection)
        auditNames = splitNames(auditSelection)
        #Every candidate seen in either column gets an entry, even if 0
        for name in origNames + auditNames:
            candDiffs.setdefault(name, 0)
        for name in origNames:
            if name not in auditNames:
                candDiffs[name] += 1
    return candDiffs

#Create a new row with data collected from the individual ballots
def AreaResults(ballots, county, container, tabulator, batch, officeMap):
    """Build the per-candidate result rows for one container/tabulator/batch.

    Parameters
    ----------
    ballots : DataFrame
        All ballots. Must have "container", "tabulator" and "batch" columns
        plus, for every key K of ``officeMap``, the columns
        "Audit Result: K" and "CVR Result: K".
    county : str
        NOTE(review): accepted but never read -- every row is labelled with
        the hard-coded string "INYO". Confirm whether that is intended.
    container, tabulator, batch : str
        Values identifying the area whose ballots are summarised.
    officeMap : dict
        Maps the contest name used in the column headers onto the desired
        natural office name written to the output.

    Returns
    -------
    list of list
        One row per office and candidate, laid out as:
        county, container, tabulator, batch, office, candidate,
        original vote total, audited vote total, literal diff, margin diff.
    """
    locationCols = ["INYO", container, tabulator, batch]

    #Restrict to the ballots of the requested area
    inArea = ((ballots["container"] == container) &
              (ballots["tabulator"] == tabulator) &
              (ballots["batch"] == batch))
    areaBallots = ballots[inArea]

    rows = []
    for rawOffice, natOffice in officeMap.items():
        auditCol = "Audit Result: " + rawOffice
        cvrCol = "CVR Result: " + rawOffice

        #Drop ballots on which this contest did not appear
        contested = areaBallots[areaBallots[auditCol] !=
                                "CONTEST_NOT_ON_BALLOT"]

        cvrTotals = ReadBallots(series = contested[cvrCol],
                                office = natOffice)
        auditTotals = ReadBallots(series = contested[auditCol],
                                  office = natOffice)
        literalDiffs = CountLiteralDiffs(curr_data = contested,
                                         original = cvrCol,
                                         audit = auditCol)

        #Sorted, de-duplicated candidates seen in either tally
        names = []
        for key in np.unique(list(auditTotals.keys()) +
                             list(cvrTotals.keys())):
            names.extend(key.split(","))

        #One output row per candidate; a candidate missing from one of the
        # tallies has 0 votes there
        for name in names:
            cvrCount = cvrTotals.get(name, 0)
            auditCount = auditTotals.get(name, 0)
            literalDiff = literalDiffs[name]
            #Margin diff is the net change in the candidate's total
            # (audited minus original)
            marginDiff = auditCount - cvrCount
            rows.append(locationCols + [natOffice, name, cvrCount,
                                        auditCount, literalDiff, marginDiff])
    return rows


###############################################################################
# Action
###############################################################################
#Contest name as it appears in the report headers -> natural office name
office_map = {
    "Governing Board Member Big Pine Unified School": "GOVERNING BOARD BIG PINE SCHOOL",
    "County Supervisor, 4th District": "COUNTY SUPERVISOR 4",
    "Bishop City Treasurer": "BISHOP CITY TREASURER",
    "Big Pine Fire Protection District": "BIG PINE FIRE PROTECTION",
    "Director, Zone 1 Northern Inyo Healthcare District": "HEALTH CARE DIRECTOR 1",
    "Measure P - The Bishop Community Safety And Essential Services Measure": "BISHOP SAFETY SERVICES",
}

#Every distinct (container, tabulator, batch) combination, as plain tuples
loc_combs = list(
    ballots[['container','tabulator','batch']]
    .drop_duplicates()
    .itertuples(index = False, name = None)
)

#Collect the result rows of each area into one flat list
data = []
for container, tabulator, batch in loc_combs:
    areaRes = AreaResults(ballots = ballots,
                          county = "Inyo County",
                          container = container,
                          tabulator = tabulator,
                          batch = batch,
                          officeMap = office_map)
    data.extend(areaRes)

#Column names follow the row layout produced by AreaResults
output_columns = [
    "county", "container", "tabulator", "batch",
    "office", "candidate", "original_votes",
    "audited_votes", "literal_diff", "margin_diff",
]
data = DataFrame(data, columns = output_columns)


###############################################################################
# Final standardization, populate remaining columns, and save
###############################################################################
#Add the constant state/date columns, then relabel the two special
# "candidate" values: whole-cell "BLANK" becomes "UNDERVOTES" and
# whole-cell "Write-in" becomes "WRITEIN"
data = (
    data.assign(state="CALIFORNIA", date="2020-11-03")
        .replace("BLANK", "UNDERVOTES")
        .replace("Write-in", "WRITEIN")
)

#Write the cleaned table without the row index
data.to_csv("../ready/inyo_cleaned.csv", index=False)

